%%javascript
// Notebook front-end tweak: disable output-area auto-scrolling so long
// cell outputs (tables, reports) are always shown in full.
IPython.OutputArea.prototype._should_scroll = function(lines) {
return false;
}
# Data handling (pandas) and interactive plotting (plotly) libraries.
import pandas as pd
import warnings
import plotly.graph_objects as go
import plotly.express as px
# Silence all warnings to keep notebook output readable.
# NOTE(review): this also hides potentially useful deprecation warnings.
warnings.filterwarnings('ignore')
# Load the labelled training data and the unlabelled test set.
train = pd.read_csv("dataset.csv")
test = pd.read_csv("test_set_no_target_column.csv")
print("train set shape:", train.shape)
print("test set shape:", test.shape)
train set shape: (2530, 4) test set shape: (1000, 4)
# Peek at 10 random training rows to get a feel for the data.
train.sample(10)
| seconds_processing | MB_data_processed | dollars_run_cost | job_result | |
|---|---|---|---|---|
| 929 | 274.726125 | 75474.443566 | 76.373708 | failed |
| 1903 | 345.888818 | 84417.330846 | 87.701881 | successful |
| 650 | 512.713507 | 515235.274906 | 114.089188 | successful |
| 843 | 441.568623 | 227427.995044 | 104.312941 | successful |
| 1874 | 442.876117 | 352187.646837 | 88.087346 | successful |
| 1837 | 416.328753 | 187473.328704 | 83.722629 | successful |
| 1151 | 527.565364 | 368085.094370 | 112.252713 | failed |
| 1580 | 322.070187 | 116550.134992 | 75.765615 | successful |
| 2210 | 174.095759 | 36010.519005 | 71.239683 | successful |
| 2345 | 249.479612 | 28779.811595 | 73.950043 | successful |
# Summary statistics for every column (numeric stats plus
# count/unique/top/freq for the categorical target).
train.describe(include='all')
| seconds_processing | MB_data_processed | dollars_run_cost | job_result | |
|---|---|---|---|---|
| count | 2530.000000 | 2530.000000 | 2530.000000 | 2530 |
| unique | NaN | NaN | NaN | 2 |
| top | NaN | NaN | NaN | successful |
| freq | NaN | NaN | NaN | 2036 |
| mean | 383.943816 | 167535.330528 | 88.449280 | NaN |
| std | 98.504249 | 116622.516698 | 11.193201 | NaN |
| min | 50.000000 | 1764.000000 | 54.000000 | NaN |
| 25% | 313.409072 | 81421.272499 | 80.627097 | NaN |
| 50% | 381.165255 | 137917.764965 | 87.632533 | NaN |
| 75% | 450.656159 | 227368.128957 | 95.676206 | NaN |
| max | 708.648018 | 833690.636892 | 135.378119 | NaN |
Target variable distribution
# Target class distribution. value_counts() yields the labels and their
# counts in a single pass, replacing one full-frame filtering scan per label.
counts = train["job_result"].value_counts()
fig = go.Figure(data=[go.Pie(labels=counts.index, values=counts.values)])
fig.show()
Relations between explanatory variables
def scatter(df, x, y):
    """Display a scatter plot of df[y] against df[x], axes titled by column name."""
    points = go.Scatter(x=df[x], y=df[y], mode='markers')
    fig = go.Figure(data=[points])
    fig.update_layout(xaxis_title=x, yaxis_title=y)
    fig.show()
from itertools import combinations

# All unordered pairs of explanatory variables; list() is sufficient —
# the original comprehension wrapper added nothing (flake8-comprehensions C4xx).
variable_pairs = list(combinations(['seconds_processing', 'MB_data_processed', 'dollars_run_cost'], 2))
for x_var, y_var in variable_pairs:
    scatter(train, x_var, y_var)
Correlation matrix - explanatory variables
from pandas.api.types import is_numeric_dtype

def corr_plot(df):
    """Return a heatmap figure of the pairwise correlations of df's numeric columns."""
    numeric_columns = [name for name in df.columns if is_numeric_dtype(df[name])]
    matrix = df[numeric_columns].corr().round(2)
    heatmap = px.imshow(matrix, text_auto=True, aspect="auto", title="Correlation matrix")
    return heatmap

corr_plot(train)
Explanatory variables vs target variable
def box_plot(df, main, group, color):
    """Show a box plot of column `main` grouped by `group`, optionally colored by `color`."""
    px.box(df, x=group, y=main, color=color, title=f"Box plot of {main}").show()

# One box plot per explanatory variable, grouped by the target.
for var in ['seconds_processing', 'MB_data_processed', 'dollars_run_cost']:
    box_plot(train, var, "job_result", None)
Main insights:
# Separate features (X) from the target (Y). columns.difference drops the
# target column — NOTE: it also sorts the remaining columns alphabetically.
Y = train["job_result"]
X = train[train.columns.difference(['job_result'])]
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled data for validation; fixed seed for reproducibility.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=101)
print("train set shape:", X_train.shape)
print("val set shape:", X_val.shape)
train set shape: (2024, 3) val set shape: (506, 3)
from sklearn.model_selection import GridSearchCV

def parameters_tuning(estimator, param_grid, X=None, y=None):
    """Grid-search `param_grid` for `estimator` with 3-fold CV on accuracy.

    The original version read the module-level ``X_train`` / ``Y_train``
    directly; ``X`` and ``y`` now default to those globals so existing calls
    keep working, while new callers can pass data explicitly.

    Prints the best parameter combination and returns the fitted
    GridSearchCV object.
    """
    if X is None:
        X = X_train
    if y is None:
        y = Y_train
    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring='accuracy',
        cv=3,
        n_jobs=-1,  # use all cores
        verbose=2,
    )
    grid_search.fit(X, y)
    print("Best params:", grid_search.best_params_)
    return grid_search
Looking for the best set of hyperparameters
from sklearn.ensemble import RandomForestClassifier
# Hyperparameter grid for the baseline random forest:
# 5 * 1 * 3 * 3 * 5 = 225 candidate combinations (matches the search output).
param_grid_random_forest = {
    'max_depth': [2, 3, 5, 10, 50],
    'max_features': [None],
    'min_samples_leaf': [2, 5, 10],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [10, 50, 100, 500, 1000]
}
grid_search_random_forest = parameters_tuning(RandomForestClassifier(), param_grid_random_forest)
Fitting 3 folds for each of 225 candidates, totalling 675 fits
Best params: {'max_depth': 3, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 100}
Evaluating model trained on the best set of hyperparameters
from sklearn.metrics import classification_report
# Refit a fresh forest on the full training split with the best
# hyperparameters found above, then score it on the held-out validation set.
model_random_forest = RandomForestClassifier(**grid_search_random_forest.best_params_)
model_random_forest.fit(X_train, Y_train)
preds_random_forest = model_random_forest.predict(X_val)
print(classification_report(Y_val, preds_random_forest))
precision recall f1-score support
failed 0.33 0.05 0.08 85
successful 0.84 0.98 0.90 421
accuracy 0.82 506
macro avg 0.58 0.51 0.49 506
weighted avg 0.75 0.82 0.76 506
import matplotlib.pyplot as plt
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is the drop-in replacement.
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the baseline random forest on the validation set.
ConfusionMatrixDisplay.from_estimator(model_random_forest, X_val, Y_val)
plt.show()
# Same grid as the baseline random forest, plus class_weight='balanced'
# to compensate for the failed/successful class imbalance (~1:4).
param_grid_random_forest_weighted = {
    'max_depth': [2, 3, 5, 10, 50],
    'max_features': [None],
    'min_samples_leaf': [2, 5, 10],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [10, 50, 100, 500, 1000],
    'class_weight': ['balanced'],
}
grid_search_random_forest_weighted = parameters_tuning(RandomForestClassifier(), param_grid_random_forest_weighted)
Fitting 3 folds for each of 225 candidates, totalling 675 fits
Best params: {'class_weight': 'balanced', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}
# Refit the class-weighted forest with its best hyperparameters and
# evaluate on the same validation split for a fair comparison.
model_random_forest_weighted = RandomForestClassifier(**grid_search_random_forest_weighted.best_params_)
model_random_forest_weighted.fit(X_train, Y_train)
preds_random_forest_weighted = model_random_forest_weighted.predict(X_val)
print(classification_report(Y_val, preds_random_forest_weighted))
precision recall f1-score support
failed 0.33 0.21 0.26 85
successful 0.85 0.91 0.88 421
accuracy 0.79 506
macro avg 0.59 0.56 0.57 506
weighted avg 0.76 0.79 0.78 506
# plot_confusion_matrix was removed in scikit-learn 1.2; use the Display API.
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(model_random_forest_weighted, X_val, Y_val)
plt.show()
from xgboost import XGBClassifier

# BUG FIX: the keys 'min_child_weight ' and 'subsample ' had trailing spaces,
# so XGBoost treated them as unknown parameters and silently ignored them
# (visible in the earlier best-params output). Keys are now clean so these
# dimensions of the grid actually take effect.
param_grid_xgboost = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [2, 3, 5, 10, 50],
    'min_child_weight': [3, 4, 5],
    'colsample_bytree': [0.5, 0.7],
    'subsample': [0.6, 0.75, 0.9],
    'n_estimators': [10, 50, 100, 500, 1000]
}
grid_search_xgboost = parameters_tuning(XGBClassifier(), param_grid_xgboost)
Fitting 3 folds for each of 900 candidates, totalling 2700 fits
Best params: {'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight ': 3, 'n_estimators': 10, 'subsample ': 0.6}
# Refit XGBoost with the best parameters found and evaluate on the validation set.
# NOTE(review): recent xgboost versions require numerically encoded labels;
# this ran with string labels here, so presumably an older version — verify.
model_xgboost = XGBClassifier(**grid_search_xgboost.best_params_)
model_xgboost.fit(X_train, Y_train)
preds_xgboost = model_xgboost.predict(X_val)
print(classification_report(Y_val, preds_xgboost))
precision recall f1-score support
failed 0.25 0.01 0.02 85
successful 0.83 0.99 0.91 421
accuracy 0.83 506
macro avg 0.54 0.50 0.46 506
weighted avg 0.73 0.83 0.76 506
# plot_confusion_matrix was removed in scikit-learn 1.2; use the Display API.
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(model_xgboost, X_val, Y_val)
plt.show()
# BUG FIX: as in the unweighted grid, 'min_child_weight ' and 'subsample '
# had trailing spaces and were silently ignored by XGBoost; keys are now clean.
# scale_pos_weight re-weights the positive class to counter the imbalance.
param_grid_xgboost_weighted = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [2, 3, 5, 10, 50],
    'min_child_weight': [3, 4, 5],
    'colsample_bytree': [0.5, 0.7],
    'subsample': [0.6, 0.75, 0.9],
    'n_estimators': [10, 50, 100, 500, 1000],
    'scale_pos_weight': [1, 25, 50, 80, 100],
}
grid_search_xgboost_weighted = parameters_tuning(XGBClassifier(), param_grid_xgboost_weighted)
Fitting 3 folds for each of 4500 candidates, totalling 13500 fits
Best params: {'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight ': 3, 'n_estimators': 10, 'scale_pos_weight': 1, 'subsample ': 0.6}
# Refit the re-weighted XGBoost with its best hyperparameters and
# evaluate on the same validation split for comparison.
model_xgboost_weighted = XGBClassifier(**grid_search_xgboost_weighted.best_params_)
model_xgboost_weighted.fit(X_train, Y_train)
preds_xgboost_weighted = model_xgboost_weighted.predict(X_val)
print(classification_report(Y_val, preds_xgboost_weighted))
precision recall f1-score support
failed 0.25 0.01 0.02 85
successful 0.83 0.99 0.91 421
accuracy 0.83 506
macro avg 0.54 0.50 0.46 506
weighted avg 0.73 0.83 0.76 506
# plot_confusion_matrix was removed in scikit-learn 1.2; use the Display API.
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(model_xgboost_weighted, X_val, Y_val)
plt.show()
# Drop the identifier column before predicting (columns.difference sorts
# the features alphabetically, matching how X was built for training),
# then collect every model's test-set predictions into a single frame.
X_test = test[test.columns.difference(['index'])]
result = pd.DataFrame({
    'index': test['index'],
    "preds_random_forest": model_random_forest.predict(X_test),
    "preds_random_forest_weighted": model_random_forest_weighted.predict(X_test),
    "preds_xgboost": model_xgboost.predict(X_test),
    "preds_xgboost_weighted": model_xgboost_weighted.predict(X_test),
})
result.to_csv("result.csv", index=False)